Goal

The goal of this project is to characterize the texts of 3 popular horror authors and to identify similarities and differences among their texts in the spooky dataset. The data consist of excerpts of texts written by Edgar Allan Poe (EAP), HP Lovecraft (HPL), and Mary Wollstonecraft Shelley (MWS).

Load packages and read the data

Set up the libraries if they are not already installed

# Every package this analysis depends on.
packages.used <- c(
  "ggplot2", "plotrix", "waffle", "dplyr", "tibble", "tidyr", "stringr",
  "tidytext", "topicmodels", "wordcloud", "plotly", "webshot", "htmlwidgets",
  "reshape2"
)

# Packages from the list above that are not in the local library yet.
packages.needed <- setdiff(packages.used, installed.packages()[, "Package"])

# Install whatever is missing, together with its dependencies.
if (length(packages.needed) > 0) {
  install.packages(
    packages.needed,
    dependencies = TRUE,
    repos = "http://cran.us.r-project.org"
  )
}


library(ggplot2)
library(dplyr)
library(tibble)
library(tidyr)
library(stringr)
library(tidytext)
library(topicmodels)
library(wordcloud)
library(plotrix)
library(waffle)
library(plotly)
library(webshot)
library(htmlwidgets)
library(reshape2)

Read in the data

The file spooky.csv is in the data folder, and this Rmd is inside the doc folder.

# Load the excerpts; as.is = TRUE keeps character columns as plain strings.
spooky <- read.csv(file = "../data/spooky.csv", as.is = TRUE)

Overview of the dataset

Take a look at the first few rows and the dimensions of the dataset

# Preview the first three excerpts with their id and author.
head(spooky, n = 3)
##        id
## 1 id26305
## 2 id17569
## 3 id11008
##                                                                                                                                                                                                                                      text
## 1 This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.
## 2                                                                                                                                                                 It never once occurred to me that the fumbling might be a mere mistake.
## 3                                In his left hand was a gold snuff box, from which, as he capered down the hill, cutting all manner of fantastic steps, he took snuff incessantly with an air of the greatest possible self satisfaction.
##   author
## 1    EAP
## 2    HPL
## 3    EAP
# Number of rows (excerpts) and columns in the dataset.
dim(spooky)
## [1] 19579     3

How many texts does each author have in the dataset?

# Tally the excerpts per author.
num_texts <- table(spooky[["author"]])
num_texts
## 
##  EAP  HPL  MWS 
## 7900 5635 6044

Plot the share of texts from each of the 3 authors in a pie chart, displaying counts and percentages

# One label per slice: author code, excerpt count, and share of the total in
# percent, each on its own line.
lbls <- paste0(
  names(num_texts), "\n",
  num_texts, "\n",
  round(num_texts / sum(num_texts) * 100, 1), "%"
)
pie3D(num_texts, labels = lbls, explode = 0.05, labelcex = 0.8)

Writing Style

Do some authors use more questions in the texts than others?

  • Count number of question marks in texts for spooky
  • Add a field num_qns for the counts
  • Wrangle data to show counts for each author
  • Plot a waffle chart to compare the use of questions in texts among the 3 authors.
# Total number of question marks across all excerpts. str_count() must be
# given the text column: passing the whole data frame collapses each column
# into a single string and yields one count per column instead of a total.
sum(str_count(spooky$text, '\\?'))
## [1] 1098
# Per-excerpt question-mark count, stored in a new num_qns column.
dat1 <- spooky %>% mutate(num_qns = str_count(text, '\\?'))
# Sum those counts within each author; the summed column is named x.
dat2 <- aggregate(x = dat1$num_qns, by = list(Author = dat1$author), FUN = sum)
dat2
##   Author   x
## 1    EAP 510
## 2    HPL 169
## 3    MWS 419
# Waffle chart of the per-author question counts (rows 1-3 of dat2, column 2).
waffle(
  setNames(dat2[1:3, 2], c('EAP', 'HPL', 'MWS')),
  rows = 20,
  size = 0.5,
  title = 'Count of Questions in Texts by Authors',
  xlab = '(1 square == 1 question)'
)

Sentiment analysis

Positive and negative emotional content comparison in authors’ text

Apply sentiment analysis using bing lexicon

# Show the bing lexicon: one row per word with its positive/negative label.
get_sentiments(lexicon = "bing")
## # A tibble: 6,788 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faced     negative 
##  2 2-faces     negative 
##  3 a+          positive 
##  4 abnormal    negative 
##  5 abolish     negative 
##  6 abominable  negative 
##  7 abominably  negative 
##  8 abominate   negative 
##  9 abomination negative 
## 10 abort       negative 
## # ... with 6,778 more rows
# Split every excerpt into one word per row (column `word`).
tidy_text <- spooky %>% unnest_tokens(word, text)
# Keep only the words present in the bing lexicon, tagged with their sentiment.
tidy_text_sentiment <- inner_join(tidy_text, get_sentiments('bing'))
## Joining, by = "word"
head(tidy_text_sentiment, n = 10)
##         id author        word sentiment
## 1  id26305    EAP     dungeon  negative
## 2  id26305    EAP   perfectly  positive
## 3  id17569    HPL     mistake  negative
## 4  id11008    EAP        gold  positive
## 5  id11008    EAP   fantastic  positive
## 6  id11008    EAP incessantly  negative
## 7  id11008    EAP    greatest  positive
## 8  id27763    MWS      lovely  positive
## 9  id27763    MWS     fertile  positive
## 10 id27763    MWS       happy  positive
# Cross-tabulate sentiment (rows) against author (columns).
dat3 <- table(
  tidy_text_sentiment[["sentiment"]],
  tidy_text_sentiment[["author"]]
)
dat3
##           
##             EAP  HPL  MWS
##   negative 7203 7605 8150
##   positive 6144 3731 6799
# Back-to-back bars per author: row 1 of dat3 (negative) on the left,
# row 2 (positive) on the right.
pyramid.plot(
  dat3[1, 1:3],
  dat3[2, 1:3],
  top.labels = NULL,
  show.values = TRUE,
  ndig = 0,
  main = 'Author by Sentiments',
  unit = c('Negative', 'Positive'),
  ppmar = c(4, 4, 4, 4),
  laxlab = FALSE,
  raxlab = FALSE
)
## [1] 5.1 4.1 4.1 2.1
# Colour key for the authors. NOTE(review): the colours are hard-coded and are
# assumed to match pyramid.plot's default bar colours -- confirm.
legend(
  'topright',
  legend = c("EAP", "HPL", "MWS"),
  col = c("red", "green", "blue"),
  lty = 1,
  bty = 'n',
  lwd = 8,
  cex = 0.8,
  horiz = TRUE
)

What are the top 100 positive and negative words the authors use?

# Comparison cloud: how often each bing-lexicon word occurs, by sentiment.
sentiment_word_counts <- tidy_text %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)
# Reshape to a word x sentiment matrix of counts, filling missing cells with 0.
sentiment_word_matrix <- acast(
  sentiment_word_counts,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)
comparison.cloud(
  sentiment_word_matrix,
  colors = c("darkgreen", "purple"),
  max.words = 100
)
## Joining, by = "word"

  • Aggregate number of negative and positive words used by each author
  • Plot a bubble chart to show sentiments and num_qns by authors
# Author x sentiment word counts as a plain data frame.
dat4 <- table(tidy_text_sentiment$author, tidy_text_sentiment$sentiment) %>%
  as.data.frame.matrix()
# Move the author row names into a leading Author column.
dat4 <- setNames(
  cbind(rownames(dat4), dat4, row.names = NULL),
  c('Author', 'negative', 'positive')
)
dat4
##   Author negative positive
## 1    EAP     7203     6144
## 2    HPL     7605     3731
## 3    MWS     8150     6799
# Rename the aggregated column x to num_qns (no-op if x is absent).
names(dat2)[names(dat2) == 'x'] <- 'num_qns'
dat2
##   Author num_qns
## 1    EAP     510
## 2    HPL     169
## 3    MWS     419
# Combine question counts and sentiment counts into one row per author.
dat5 <- dat2 %>% inner_join(dat4, by = 'Author')
## Warning: Column `Author` joining character vector and factor, coercing into
## character vector
dat5
##   Author num_qns negative positive
## 1    EAP     510     7203     6144
## 2    HPL     169     7605     3731
## 3    MWS     419     8150     6799
#   Author num_qns negative positive
# 1    EAP     510     7203     6144
# 2    HPL     169     7605     3731
# 3    MWS     419     8150     6799

# Bubble chart: positive (x) vs negative (y) word counts per author, with the
# bubble size driven by the number of questions.
p <- plot_ly(
  dat5,
  x = ~positive,
  y = ~negative,
  size = ~num_qns,
  color = ~Author,
  type = 'scatter',
  mode = 'markers',
  marker = list(opacity = 0.5)
)
p <- layout(
  p,
  title = '<b>Sentiments and Num of Questions per Author</b>',
  xaxis = list(title = '<b>Positive words</b>', showgrid = FALSE),
  yaxis = list(title = '<b>Negative words</b>', showgrid = FALSE),
  showlegend = FALSE
)
# Annotate each bubble with the author code and its question count.
p <- add_annotations(
  p,
  text = paste(dat5$Author, '\n', dat5$num_qns),
  xref = "x",
  yref = "y",
  showarrow = TRUE,
  arrowsize = 0.5,
  ax = 40,
  ay = -60
)

p
#export(p, file = '/Users/qinqingao/Documents/GitHub/spring2018-project1-ginnyqg/figs/Bubble.png')
  • With the same method, explore the relationship between the authors’ use of questions in sentences and their total volume of text.
# Length of each excerpt in characters (str_length counts characters).
spooky$sen_length <- str_length(spooky$text)

dat6 <- spooky %>% mutate(sen_length = sen_length)
# Total characters written by each author; the summed column is named x.
dat7 <- aggregate(x = dat6$sen_length, by = list(Author = dat6$author), FUN = sum)

# Give the aggregated column a descriptive name (no-op if x is absent).
names(dat7)[names(dat7) == 'x'] <- 'sen_length'
dat7
##   Author sen_length
## 1    EAP    1123585
## 2    HPL     878178
## 3    MWS     916632
# Attach the per-author character totals to the question/sentiment summary.
dat8 <- dat7 %>% inner_join(dat5, by = 'Author')
dat8
##   Author sen_length num_qns negative positive
## 1    EAP    1123585     510     7203     6144
## 2    HPL     878178     169     7605     3731
## 3    MWS     916632     419     8150     6799
# Turn the per-author excerpt counts into a two-column data frame.
new_num_texts <- setNames(melt(num_texts), c('Author', 'num_text'))
new_num_texts
##   Author num_text
## 1    EAP     7900
## 2    HPL     5635
## 3    MWS     6044
# Add the excerpt counts to the per-author summary table.
dat9 <- new_num_texts %>% inner_join(dat8, by = 'Author')
## Warning: Column `Author` joining factor and character vector, coercing into
## character vector
dat9
##   Author num_text sen_length num_qns negative positive
## 1    EAP     7900    1123585     510     7203     6144
## 2    HPL     5635     878178     169     7605     3731
## 3    MWS     6044     916632     419     8150     6799
# Bubble chart: number of excerpts (x) vs number of questions (y) per author,
# with the bubble size driven by the total character count.
p2 <- plot_ly(
  dat9,
  x = ~num_text,
  y = ~num_qns,
  size = ~sen_length,
  color = ~Author,
  type = 'scatter',
  mode = 'markers',
  marker = list(opacity = 0.5)
)
p2 <- layout(
  p2,
  title = '<b>Num of Texts, Questions, Sentence Length per Author</b>',
  xaxis = list(title = '<b>Num of Texts</b>', showgrid = FALSE),
  yaxis = list(title = '<b>Num of Questions</b>', showgrid = FALSE),
  showlegend = FALSE
)
# Annotate each bubble with the author code and its comma-formatted total length.
p2 <- add_annotations(
  p2,
  text = paste(
    dat9$Author,
    '\n',
    prettyNum(dat9$sen_length, big.mark = ',', scientific = FALSE)
  ),
  xref = "x",
  yref = "y",
  showarrow = TRUE,
  arrowsize = 0.5,
  ax = 40,
  ay = -60
)


p2
#export(p2, file = '/Users/qinqingao/Documents/GitHub/spring2018-project1-ginnyqg/figs/Bubble_num_text_qns_sent.png')